Descriptive statistics

rm(list = ls())  # Clear workspace

library(tidyverse)
library(readxl)
library(janitor)
library(skimr)

Data Load: set the data path and view the names of all sheets from the EXCEL table.

data_path = "../data/ai_interview_dataset_1129.xlsx"

sheets <- excel_sheets(data_path)
print("Sheets in the dataset:")
## [1] "Sheets in the dataset:"
sheets           # List all sheet names
## [1] "users"                "sessions"             "technical_questions" 
## [4] "behavioral_questions" "system_events"        "timeline"

Read each sheet from the excel file:

data_list <- list()
for (sheet in sheets) {
  data_list[[sheet]] <- read_excel(data_path, sheet = sheet)
}

Dataset Overview: summarises the basic structure of each dataset, including the number of rows, columns, variable names, and missing rate.

# Overview of each sheet
overview <- list()
for (sheet in sheets) {
    df <- data_list[[sheet]]
    overview[[sheet]] <- tibble(
        rows = nrow(df),
        cols = ncol(df),
        col_names = paste(names(df), collapse = ", "),
        missing_rate = sum(is.na(df)) / (nrow(df) * ncol(df))
)
}
print("Sheet overview:")

[1] “Sheet overview:”

overview

$users # A tibble: 1 × 4 rows cols col_names missing_rate 1 1129 23 user_id, track, mbti, timezone, device_type, highest… 0

$sessions # A tibble: 1 × 4 rows cols col_names missing_rate 1 1129 15 session_id, user_id, session_start_ts, session_end_t… 0

$technical_questions # A tibble: 1 × 4 rows cols col_names missing_rate 1 3387 23 tech_question_id, session_id, order_in_session, ques… 0.180

$behavioral_questions # A tibble: 1 × 4 rows cols col_names missing_rate 1 3387 22 behavioral_question_id, session_id, order_in_session… 0

$system_events # A tibble: 1 × 4 rows cols col_names missing_rate 1 2885 5 event_id, session_id, event_ts, event_type, payload_… 0

$timeline # A tibble: 1 × 4 rows cols col_names missing_rate 1 2258 5 timeline_id, user_id, session_id, event_ts, event_ty… 0 From the overview, we can see that the general size, variable name and completeness of each table. And we can see that the missing rate of the “technical_questions” table is 0.18, which indicates that this table has missing values.

Numeric Variable Summary: select only numeric variables and analyse numeric variables using descriptive statistics.

summary_list <- list()
for (sheet in sheets) {
  df <- data_list[[sheet]]
  
  # Select only numeric columns for summary
  numeric_df <- df %>% select(where(is.numeric)) 
  # Print sheet name as a header
  cat("\n## Sheet:", sheet, "\n")   
  # Generates information such as mean, median, standard deviation, minimum, maximum, and missing values.
  if (ncol(numeric_df) > 0) {
    summary_list[[sheet]] <- skim(numeric_df)
    print(summary_list[[sheet]])
  }
  else{
    cat("No numeric columns to summarize in this sheet.\n")
  }
}
## 
## ## Sheet: users 
## ── Data Summary ────────────────────────
##                            Values    
## Name                       numeric_df
## Number of rows             1129      
## Number of columns          15        
## _______________________              
## Column type frequency:               
##   numeric                  15        
## ________________________             
## Group variables            None      
## 
## ── Variable type: numeric ──────────────────────────────────────────────────────
##    skim_variable                 n_missing complete_rate     mean      sd     p0
##  1 stem_degree_flag                      0             1    0.961   0.194    0  
##  2 graduation_year                       0             1 2023.      1.10  2019  
##  3 total_experience_years                0             1    2.83    1.51     0  
##  4 relevant_experience_years             0             1    1.92    1.16     0  
##  5 internship_count                      0             1    2.02    1.12     0  
##  6 project_count                         0             1    2.49    2.06     1  
##  7 leadership_experience_flag            0             1    0.389   0.488    0  
##  8 resume_experience_score               0             1   83.3    13.6     41  
##  9 skill_match_score                     0             1   82.3    14.2     34  
## 10 experience_depth_score                0             1    4.13    1.13     1  
## 11 experience_consistency_flag           0             1    0.919   0.272    0  
## 12 resume_gap_detected                   0             1    0.123   0.329    0  
## 13 resume_review_time_sec                0             1  198.     59.3     60  
## 14 resume_feedback_length_tokens         0             1  342.    141.      80  
## 15 overall_resume                        0             1   76.7     8.85    46.8
##        p25     p50     p75    p100 hist 
##  1    1       1       1       1    ▁▁▁▁▇
##  2 2023    2023    2024    2025    ▁▁▂▇▇
##  3    1.83    2.58    3.42    8.5  ▃▇▃▁▁
##  4    1.17    1.75    2.42    7.83 ▇▇▂▁▁
##  5    1       2       3       5    ▇▇▅▂▁
##  6    1       2       3      12    ▇▁▁▁▁
##  7    0       0       1       1    ▇▁▁▁▅
##  8   74      85      96      99    ▁▂▃▆▇
##  9   73      84      95      99    ▁▁▃▆▇
## 10    3       5       5       5    ▁▂▂▃▇
## 11    1       1       1       1    ▁▁▁▁▇
## 12    0       0       0       1    ▇▁▁▁▁
## 13  158     198     237     405    ▃▇▇▂▁
## 14  241     336     434     766    ▅▇▇▃▁
## 15   71      78      83.8    90.5  ▁▂▅▇▇
## 
## ## Sheet: sessions 
## ── Data Summary ────────────────────────
##                            Values    
## Name                       numeric_df
## Number of rows             1129      
## Number of columns          7         
## _______________________              
## Column type frequency:               
##   numeric                  7         
## ________________________             
## Group variables            None      
## 
## ── Variable type: numeric ──────────────────────────────────────────────────────
##   skim_variable             n_missing complete_rate    mean     sd  p0 p25 p50
## 1 overall_tech_score                0             1  72.2   16.6    29  57  77
## 2 weighted_behavior_score           0             1  75.5    7.75   48  70  75
## 3 pass_flag                         0             1   0.671  0.470   0   0   1
## 4 confidence_change_overall         0             1  -0.116  4.97  -10  -4   0
## 5 latency_ms_p50                    0             1  43.5   10.3    24  36  43
## 6 latency_ms_p95                    0             1 113.    21.6    65  98 111
## 7 messages_exchanged                0             1  49.6    9.79   29  42  50
##   p75 p100 hist 
## 1  85  100 ▁▅▃▇▆
## 2  81   96 ▁▃▇▇▂
## 3   1    1 ▃▁▁▁▇
## 4   3   10 ▅▇▇▆▃
## 5  51   68 ▅▇▇▅▂
## 6 127  176 ▂▇▇▃▁
## 7  57   70 ▅▇▇▇▅
## 
## ## Sheet: technical_questions 
## ── Data Summary ────────────────────────
##                            Values    
## Name                       numeric_df
## Number of rows             3387      
## Number of columns          8         
## _______________________              
## Column type frequency:               
##   numeric                  8         
## ________________________             
## Group variables            None      
## 
## ── Variable type: numeric ──────────────────────────────────────────────────────
##   skim_variable      n_missing complete_rate    mean      sd p0  p25 p50 p75
## 1 order_in_session           0         1       2       0.817  1   1    2   3
## 2 time_to_answer_sec         0         1     519.    277.    70 312. 474 683
## 3 question_score             0         1      66.3    21.1   13  47   74  83
## 4 correctness_flag           0         1       0.610   0.488  0   0    1   1
## 5 followup_turns             0         1       3.01    1.44   1   2    3   4
## 6 analysis_depth             0         1       3.29    1.26   1   2    3   4
## 7 overthinking_flag          0         1       0.122   0.328  0   0    0   0
## 8 statistical_depth       2574         0.240   3.01    0.642  2   3    3   3
##   p100 hist 
## 1    3 ▇▁▇▁▇
## 2 1409 ▆▇▃▂▁
## 3  100 ▁▅▃▇▆
## 4    1 ▅▁▁▁▇
## 5    6 ▇▅▅▂▂
## 6    6 ▇▇▇▃▁
## 7    1 ▇▁▁▁▁
## 8    4 ▃▁▇▁▃
## 
## ## Sheet: behavioral_questions 
## ── Data Summary ────────────────────────
##                            Values    
## Name                       numeric_df
## Number of rows             3387      
## Number of columns          17        
## _______________________              
## Column type frequency:               
##   numeric                  17        
## ________________________             
## Group variables            None      
## 
## ── Variable type: numeric ──────────────────────────────────────────────────────
##    skim_variable                      n_missing complete_rate     mean     sd
##  1 order_in_session                           0             1   2       0.817
##  2 response_time_sec                          0             1 238.     42.3  
##  3 response_length_tokens                     0             1 197.     44.3  
##  4 response_sentiment_score                   0             1   0.0793  0.259
##  5 hesitation_flag                            0             1   0.244   0.429
##  6 followup_depth                             0             1   1.49    1.12 
##  7 communication_clarity_score                0             1   3.70    0.975
##  8 teamwork_score                             0             1   3.69    0.967
##  9 conflict_resolution_score                  0             1   3.56    0.967
## 10 leadership_signal_score                    0             1   3.42    1.02 
## 11 problem_structuring_score                  0             1   3.65    0.988
## 12 professionalism_score                      0             1   3.82    0.945
## 13 behavioral_strength_flag                   0             1   0.535   0.499
## 14 behavioral_risk_flag                       0             1   0.0466  0.211
## 15 overall_behavioral_score_component         0             1  75.5    12.2  
## 16 weighted_behavior_score                    0             1  75.5     7.74 
## 17 confidence_change_behavioral               0             1   0.243   2.69 
##       p0    p25    p50   p75  p100 hist 
##  1   1     1      2      3     3   ▇▁▇▁▇
##  2 160   203    239    273   337   ▆▇▇▇▂
##  3 120   160    196    233   300   ▇▇▇▇▂
##  4  -0.4  -0.13   0.08   0.3   0.6 ▆▇▇▇▅
##  5   0     0      0      0     1   ▇▁▁▁▂
##  6   0     0      1      3     3   ▇▇▁▇▇
##  7   1     3      4      4     5   ▁▂▆▇▅
##  8   1     3      4      4     5   ▁▂▆▇▅
##  9   1     3      4      4     5   ▁▂▇▇▃
## 10   1     3      3      4     5   ▁▃▇▇▃
## 11   1     3      4      4     5   ▁▂▆▇▅
## 12   1     3      4      5     5   ▁▂▆▇▆
## 13   0     0      1      1     1   ▇▁▁▁▇
## 14   0     0      0      0     1   ▇▁▁▁▁
## 15  33    67     76     84    98   ▁▂▆▇▅
## 16  48    70     75     81    96   ▁▃▇▇▂
## 17  -4    -2      0      2     6   ▇▇▇▅▂
## 
## ## Sheet: system_events 
## No numeric columns to summarize in this sheet.
## 
## ## Sheet: timeline 
## No numeric columns to summarize in this sheet.

The results include mean, median, standard deviation, minimum, maximum, and missing value information for each numeric variable.

Suspicious points: Sheet users 1. From the chart, we can see that the highest value of total_experience_years is 8.5. But the codebook specifies that the upper limit for the early-career focus is 5 years. 2. The project_count is at its maximum of 12, with an average of 2.5, which is above the normal range. 3. The maximum value of resume_feedback_length_tokens is 766. We can conduct a visual analysis of outliers to determine if it is reasonable.

Sheet technical_questions 1. The percentage of statistical_depth’s missing values is extremely high. But the codebook indicates that [DS-SPECIFIC COLUMNS] (NULL for SDE track). Therefore, this absence might be reasonable due to the issue with the SDE track, and it is not a data error. 2. The highest value of time_to_answer_sec is 1409. But the codebook specifies that the range of this is from 15s to 360s.

clean_for_plot <- function(df, x, y = NULL){
  # This function filters out rows with non-finite values in the specified columns for plotting.
  if (is.null(y)) {
    df %>% filter( is.finite(.data[[x]]))  # Filter out rows where the x column has non-finite values (NA, Inf, -Inf)
  } else {
    df %>% filter( is.finite(.data[[x]]), is.finite(.data[[y]]))   # Filter out rows where either x or y column has non-finite values
  }
}

# Data cleaning
cleaned_users <- data_list[["users"]] %>%
  mutate(total_experience_years = ifelse(total_experience_years > 5, 5, total_experience_years))   # Cap experience at 5 years (upper limit is 5 years)
data_list[["users"]] <- cleaned_users

cleaned_technical_questions <- data_list[["technical_questions"]] %>%
  mutate(time_to_answer_sec = ifelse(time_to_answer_sec > 360, 360, time_to_answer_sec))           # Cap time at 360 seconds (upper limit is 360 seconds)
data_list[["technical_questions"]] <- cleaned_technical_questions

# After cleaning, we can check the summary statistics again to see the effect of capping
summary_list_clean <- list()
for (sheet in sheets) {
  df <- data_list[[sheet]]
  numeric_df <- df %>% select(where(is.numeric)) 
  cat("\n## Sheet:", sheet, "after cleaning\n")   
  if (ncol(numeric_df) > 0) {
    summary_list_clean[[sheet]] <- skim(numeric_df)
    print(summary_list_clean[[sheet]])
  }
  else{
    cat("No numeric columns to summarize in this sheet.\n")
  }
}
## 
## ## Sheet: users after cleaning
## ── Data Summary ────────────────────────
##                            Values    
## Name                       numeric_df
## Number of rows             1129      
## Number of columns          15        
## _______________________              
## Column type frequency:               
##   numeric                  15        
## ________________________             
## Group variables            None      
## 
## ── Variable type: numeric ──────────────────────────────────────────────────────
##    skim_variable                 n_missing complete_rate     mean      sd     p0
##  1 stem_degree_flag                      0             1    0.961   0.194    0  
##  2 graduation_year                       0             1 2023.      1.10  2019  
##  3 total_experience_years                0             1    2.71    1.23     0  
##  4 relevant_experience_years             0             1    1.92    1.16     0  
##  5 internship_count                      0             1    2.02    1.12     0  
##  6 project_count                         0             1    2.49    2.06     1  
##  7 leadership_experience_flag            0             1    0.389   0.488    0  
##  8 resume_experience_score               0             1   83.3    13.6     41  
##  9 skill_match_score                     0             1   82.3    14.2     34  
## 10 experience_depth_score                0             1    4.13    1.13     1  
## 11 experience_consistency_flag           0             1    0.919   0.272    0  
## 12 resume_gap_detected                   0             1    0.123   0.329    0  
## 13 resume_review_time_sec                0             1  198.     59.3     60  
## 14 resume_feedback_length_tokens         0             1  342.    141.      80  
## 15 overall_resume                        0             1   76.7     8.85    46.8
##        p25     p50     p75    p100 hist 
##  1    1       1       1       1    ▁▁▁▁▇
##  2 2023    2023    2024    2025    ▁▁▂▇▇
##  3    1.83    2.58    3.42    5    ▂▆▇▅▅
##  4    1.17    1.75    2.42    7.83 ▇▇▂▁▁
##  5    1       2       3       5    ▇▇▅▂▁
##  6    1       2       3      12    ▇▁▁▁▁
##  7    0       0       1       1    ▇▁▁▁▅
##  8   74      85      96      99    ▁▂▃▆▇
##  9   73      84      95      99    ▁▁▃▆▇
## 10    3       5       5       5    ▁▂▂▃▇
## 11    1       1       1       1    ▁▁▁▁▇
## 12    0       0       0       1    ▇▁▁▁▁
## 13  158     198     237     405    ▃▇▇▂▁
## 14  241     336     434     766    ▅▇▇▃▁
## 15   71      78      83.8    90.5  ▁▂▅▇▇
## 
## ## Sheet: sessions after cleaning
## ── Data Summary ────────────────────────
##                            Values    
## Name                       numeric_df
## Number of rows             1129      
## Number of columns          7         
## _______________________              
## Column type frequency:               
##   numeric                  7         
## ________________________             
## Group variables            None      
## 
## ── Variable type: numeric ──────────────────────────────────────────────────────
##   skim_variable             n_missing complete_rate    mean     sd  p0 p25 p50
## 1 overall_tech_score                0             1  72.2   16.6    29  57  77
## 2 weighted_behavior_score           0             1  75.5    7.75   48  70  75
## 3 pass_flag                         0             1   0.671  0.470   0   0   1
## 4 confidence_change_overall         0             1  -0.116  4.97  -10  -4   0
## 5 latency_ms_p50                    0             1  43.5   10.3    24  36  43
## 6 latency_ms_p95                    0             1 113.    21.6    65  98 111
## 7 messages_exchanged                0             1  49.6    9.79   29  42  50
##   p75 p100 hist 
## 1  85  100 ▁▅▃▇▆
## 2  81   96 ▁▃▇▇▂
## 3   1    1 ▃▁▁▁▇
## 4   3   10 ▅▇▇▆▃
## 5  51   68 ▅▇▇▅▂
## 6 127  176 ▂▇▇▃▁
## 7  57   70 ▅▇▇▇▅
## 
## ## Sheet: technical_questions after cleaning
## ── Data Summary ────────────────────────
##                            Values    
## Name                       numeric_df
## Number of rows             3387      
## Number of columns          8         
## _______________________              
## Column type frequency:               
##   numeric                  8         
## ________________________             
## Group variables            None      
## 
## ── Variable type: numeric ──────────────────────────────────────────────────────
##   skim_variable      n_missing complete_rate    mean     sd p0  p25 p50 p75 p100
## 1 order_in_session           0         1       2      0.817  1   1    2   3    3
## 2 time_to_answer_sec         0         1     319.    77.6   70 312. 360 360  360
## 3 question_score             0         1      66.3   21.1   13  47   74  83  100
## 4 correctness_flag           0         1       0.610  0.488  0   0    1   1    1
## 5 followup_turns             0         1       3.01   1.44   1   2    3   4    6
## 6 analysis_depth             0         1       3.29   1.26   1   2    3   4    6
## 7 overthinking_flag          0         1       0.122  0.328  0   0    0   0    1
## 8 statistical_depth       2574         0.240   3.01   0.642  2   3    3   3    4
##   hist 
## 1 ▇▁▇▁▇
## 2 ▁▁▁▁▇
## 3 ▁▅▃▇▆
## 4 ▅▁▁▁▇
## 5 ▇▅▅▂▂
## 6 ▇▇▇▃▁
## 7 ▇▁▁▁▁
## 8 ▃▁▇▁▃
## 
## ## Sheet: behavioral_questions after cleaning
## ── Data Summary ────────────────────────
##                            Values    
## Name                       numeric_df
## Number of rows             3387      
## Number of columns          17        
## _______________________              
## Column type frequency:               
##   numeric                  17        
## ________________________             
## Group variables            None      
## 
## ── Variable type: numeric ──────────────────────────────────────────────────────
##    skim_variable                      n_missing complete_rate     mean     sd
##  1 order_in_session                           0             1   2       0.817
##  2 response_time_sec                          0             1 238.     42.3  
##  3 response_length_tokens                     0             1 197.     44.3  
##  4 response_sentiment_score                   0             1   0.0793  0.259
##  5 hesitation_flag                            0             1   0.244   0.429
##  6 followup_depth                             0             1   1.49    1.12 
##  7 communication_clarity_score                0             1   3.70    0.975
##  8 teamwork_score                             0             1   3.69    0.967
##  9 conflict_resolution_score                  0             1   3.56    0.967
## 10 leadership_signal_score                    0             1   3.42    1.02 
## 11 problem_structuring_score                  0             1   3.65    0.988
## 12 professionalism_score                      0             1   3.82    0.945
## 13 behavioral_strength_flag                   0             1   0.535   0.499
## 14 behavioral_risk_flag                       0             1   0.0466  0.211
## 15 overall_behavioral_score_component         0             1  75.5    12.2  
## 16 weighted_behavior_score                    0             1  75.5     7.74 
## 17 confidence_change_behavioral               0             1   0.243   2.69 
##       p0    p25    p50   p75  p100 hist 
##  1   1     1      2      3     3   ▇▁▇▁▇
##  2 160   203    239    273   337   ▆▇▇▇▂
##  3 120   160    196    233   300   ▇▇▇▇▂
##  4  -0.4  -0.13   0.08   0.3   0.6 ▆▇▇▇▅
##  5   0     0      0      0     1   ▇▁▁▁▂
##  6   0     0      1      3     3   ▇▇▁▇▇
##  7   1     3      4      4     5   ▁▂▆▇▅
##  8   1     3      4      4     5   ▁▂▆▇▅
##  9   1     3      4      4     5   ▁▂▇▇▃
## 10   1     3      3      4     5   ▁▃▇▇▃
## 11   1     3      4      4     5   ▁▂▆▇▅
## 12   1     3      4      5     5   ▁▂▆▇▆
## 13   0     0      1      1     1   ▇▁▁▁▇
## 14   0     0      0      0     1   ▇▁▁▁▁
## 15  33    67     76     84    98   ▁▂▆▇▅
## 16  48    70     75     81    96   ▁▃▇▇▂
## 17  -4    -2      0      2     6   ▇▇▇▅▂
## 
## ## Sheet: system_events after cleaning
## No numeric columns to summarize in this sheet.
## 
## ## Sheet: timeline after cleaning
## No numeric columns to summarize in this sheet.

Data visualization

Create Image: Create histograms for each numeric column in each sheet

# Create histograms for each numeric column in each sheet
for (sheet in sheets) {
  df <- data_list[[sheet]]
  numeric_df <- df %>% select(where(is.numeric))
  if (ncol(numeric_df) > 0) {
    for (col_name in names(numeric_df)){
      df2 <- clean_for_plot(df, col_name)  # Clean data for plotting
      p.hist <- ggplot(df2, aes(x = .data[[col_name]])) + 
        geom_histogram(binwidth = 1, fill = "blue", color = "black") +
        labs(title = paste("Histogram of", col_name, "in sheet", sheet), x = col_name, y = "Count")
      print(p.hist)
    }
  } }

# Create boxplots for each numeric column in each sheet
for (sheet in sheets) {
  df <- data_list[[sheet]]
  numeric_df <- df %>% select(where(is.numeric))
  
  # Create boxplots for each numeric column in the sheet (without grouping)
  if (ncol(numeric_df) > 0) {
    for (col_name in names(numeric_df)){
      df2 <- clean_for_plot(df, col_name)  # Clean data for plotting
      p.box.single <- ggplot(df2, aes(x = factor(1), y = .data[[col_name]])) + 
        geom_boxplot(outlier.colour = "purple", fill = "blue", color = "black") +
        stat_summary(fun = median, geom = "point", shape = 20, size = 3, color = "red") +   # Add median point
        stat_summary(fun = mean, geom = "point", shape = 20, size = 3, color = "green") +   # Add mean point
        labs(title = paste("Boxplot of", col_name, "in sheet", sheet), x = "", y = col_name)
      print(p.box.single)
    }
  }
}

# Create boxplots for each numeric column grouped by "track" if "track" column exists
for (sheet in sheets) {
  df <- data_list[[sheet]]
  numeric_df <- df %>% select(where(is.numeric))
  
  # Check if "track" column exists and there are numeric columns to plot
  if ("track" %in% names(df) & ncol(numeric_df) > 0) {
    # Convert "track" to factor for better plotting
    df$track <- as.factor(df$track)        # Making sure "track" is treated as a categorical variable
    for (col_name in names(numeric_df)){
      df2 <- clean_for_plot(df, col_name)  # Clean data for plotting
      df2 <- df2 %>% filter(!is.na(track))  # Filter out rows where "track" is NA
      p.box.track <- ggplot(df2, aes(x = track, y = .data[[col_name]])) +
        geom_boxplot(aes(fill = track), outlier.colour = "purple") +
        stat_summary(fun = median, geom = "point", shape = 20, size = 3, color = "red") +   # Add median point
        stat_summary(fun = mean, geom = "point", shape = 20, size = 3, color = "green") +   # Add mean point
        labs(title = paste("Boxplot of", col_name, "by track in sheet", sheet), x = "Track",y = col_name)
      print(p.box.track)
    }
  } 
}

# Create boxplots for each numeric column grouped by major groups if they exist
major_groups <- c("track", "device_type", "timezone") # Define major groups for analysis
for (sheet in sheets) {
  df <- data_list[[sheet]]
  numeric_df <- df %>% select(where(is.numeric))
  
  # Create boxplots for each numeric column grouped by major groups if they exist
  if (ncol(numeric_df) > 0) {   # Only create boxplots if there are numeric columns to plot
    for (col_name  in names(numeric_df)) { 
      # Loop through each major group and create boxplots if the group column exists in the dataframe 
      for (group in major_groups) {
        if (group %in% names(df)) {
          df2 <- clean_for_plot(df, col_name)  # Clean data for plotting
          df2 <- df2 %>% filter(!is.na(.data[[group]]))  # Filter out rows where the group variable is NA
          
          p.box.group <- ggplot(df2, aes(x = .data[[group]], y = .data[[col_name]])) +
            geom_boxplot(aes(fill = .data[[group]]), outlier.colour = "purple") +
            stat_summary(fun = median, geom = "point", shape = 20, size = 3, color = "red") +   # Add median point
            stat_summary(fun = mean, geom = "point", shape = 20, size = 3, color = "green") +   # Add mean point
            labs(title = paste("Boxplot of", col_name, "by", group, "in sheet", sheet), x = group, y = col_name)
          print(p.box.group)
        }
      }
    }
  }
}

# Create scatter plots for each pair of numeric columns in each sheet, colored by "track" if it exists, and calculate correlation
for (sheet in sheets) {
  df <- data_list[[sheet]]
  numeric_df <- df %>% select(where(is.numeric))
  
  if (ncol(numeric_df) > 1) {  # Only create scatter plots if there are at least 2 numeric columns
    numeric_cols <- names(numeric_df)
    
    for (i in 1:(length(numeric_cols)-1)) {
      for (j in (i+1):length(numeric_cols)) {
        x <- numeric_cols[i]
        y <- numeric_cols[j]
        
        df2 <- clean_for_plot(df, x, y)  # Clean data for plotting
        cor_val <- cor(df2[[x]], df2[[y]], use = "complete.obs")  # Calculate correlation, excluding NA values
        
        if ("track" %in% names(df2)) {
          df2 <- df2 %>% filter(!is.na(track))  # Filter out rows where "track" is NA
          df2$track <- as.factor(df2$track)     # Ensure "track" is treated as a categorical variable
          
          if (length(levels(df2$track)) > 1) {
            p.scatter <- ggplot(df2, aes(x = .data[[x]], y = .data[[y]])) +
              geom_point(aes(color = track)) +
              geom_smooth(method = "lm", se = FALSE) +  # Add linear regression line
              labs(color = "Track", title = paste("Scatter plot of", x, "vs", y, "in sheet", sheet, "\nCorrelation:", round(cor_val, 2)),
                   x = x, y = y
              )
          } else {
            p.scatter <- ggplot(df2, aes(x = .data[[x]], y = .data[[y]])) +
              geom_point() +
              geom_smooth(method = "lm", se = FALSE) +  # Add linear regression line
              labs(color = "Device Type",
                   title = paste("Scatter plot of", x, "vs", y, "by device_type in sheet", sheet,
                                 "\nCorrelation:", round(cor_val, 2)),
                   x = x, y = y)
          }
          print(p.scatter)
        }
      }
    }
  }
}

# Scatter plots colored by device_type
for (sheet in sheets) {
  df <- data_list[[sheet]]
  numeric_df <- df %>% select(where(is.numeric))
  
  if (ncol(numeric_df) > 1) {  
    numeric_cols <- names(numeric_df)
    for (i in 1:(length(numeric_cols)-1)) {
      for (j in (i+1):length(numeric_cols)) {
        x <- numeric_cols[i]
        y <- numeric_cols[j]
        df2 <- clean_for_plot(df, x, y)
        
        if ("device_type" %in% names(df2)) {
          df2 <- df2 %>% filter(!is.na(device_type))
          df2$device_type <- as.factor(df2$device_type)
          
          if (length(levels(df2$device_type)) > 1) {
            cor_val <- cor(df2[[x]], df2[[y]], use = "complete.obs")
            p.scatter <- ggplot(df2, aes(x = .data[[x]], y = .data[[y]])) +
              geom_point(aes(color = device_type)) +
              geom_smooth(method = "lm", se = FALSE) +
              labs(color = "Device Type",
                   title = paste("Scatter plot of", x, "vs", y, "by device_type in sheet", sheet,
                                 "\nCorrelation:", round(cor_val, 2)),
                   x = x, y = y)
            print(p.scatter)
          }
        }
      }
    }
  }
}

# Scatter plots colored by timezone
for (sheet in sheets) {
  df <- data_list[[sheet]]
  numeric_df <- df %>% select(where(is.numeric))
  
  if (ncol(numeric_df) > 1) {  
    numeric_cols <- names(numeric_df)
    for (i in 1:(length(numeric_cols)-1)) {
      for (j in (i+1):length(numeric_cols)) {
        x <- numeric_cols[i]
        y <- numeric_cols[j]
        df2 <- clean_for_plot(df, x, y)
        
        if ("timezone" %in% names(df2)) {
          df2 <- df2 %>% filter(!is.na("timezone"))
          df2$timezone <- as.factor(df2$timezone)
          
          if (length(levels(df2$timezone)) > 1) {
            cor_val <- cor(df2[[x]], df2[[y]], use = "complete.obs")
            p.scatter <- ggplot(df2, aes(x = .data[[x]], y = .data[[y]])) +
              geom_point(aes(color = timezone)) +
              geom_smooth(method = "lm", se = FALSE) +
              labs(color = "Timezone",
                   title = paste("Scatter plot of", x, "vs", y, "by timezone in sheet", sheet,
                                 "\nCorrelation:", round(cor_val, 2)),
                   x = x, y = y)
            print(p.scatter)
          }
        }
      }
    }
  }
}